In [1]:
import pysal as ps
import numpy as np
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
# Open the example table of US state per-capita incomes, 1929-2009.
f = ps.open(ps.examples.get_path('usjoin.csv'), 'r')

To determine what is in the file, check the header attribute on the file object:


In [3]:
f.header


Out[3]:
['Name',
 'STATE_FIPS',
 '1929',
 '1930',
 '1931',
 '1932',
 '1933',
 '1934',
 '1935',
 '1936',
 '1937',
 '1938',
 '1939',
 '1940',
 '1941',
 '1942',
 '1943',
 '1944',
 '1945',
 '1946',
 '1947',
 '1948',
 '1949',
 '1950',
 '1951',
 '1952',
 '1953',
 '1954',
 '1955',
 '1956',
 '1957',
 '1958',
 '1959',
 '1960',
 '1961',
 '1962',
 '1963',
 '1964',
 '1965',
 '1966',
 '1967',
 '1968',
 '1969',
 '1970',
 '1971',
 '1972',
 '1973',
 '1974',
 '1975',
 '1976',
 '1977',
 '1978',
 '1979',
 '1980',
 '1981',
 '1982',
 '1983',
 '1984',
 '1985',
 '1986',
 '1987',
 '1988',
 '1989',
 '1990',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009']

OK, let's pull in the name variable to see what we have.


In [4]:
# Pull the state names out of the 'Name' column (returns a plain Python list).
name = f.by_col('Name')

In [5]:
name


Out[5]:
['Alabama',
 'Arizona',
 'Arkansas',
 'California',
 'Colorado',
 'Connecticut',
 'Delaware',
 'Florida',
 'Georgia',
 'Idaho',
 'Illinois',
 'Indiana',
 'Iowa',
 'Kansas',
 'Kentucky',
 'Louisiana',
 'Maine',
 'Maryland',
 'Massachusetts',
 'Michigan',
 'Minnesota',
 'Mississippi',
 'Missouri',
 'Montana',
 'Nebraska',
 'Nevada',
 'New Hampshire',
 'New Jersey',
 'New Mexico',
 'New York',
 'North Carolina',
 'North Dakota',
 'Ohio',
 'Oklahoma',
 'Oregon',
 'Pennsylvania',
 'Rhode Island',
 'South Carolina',
 'South Dakota',
 'Tennessee',
 'Texas',
 'Utah',
 'Vermont',
 'Virginia',
 'Washington',
 'West Virginia',
 'Wisconsin',
 'Wyoming']

Now obtain per capita incomes in 1929, which are in the column associated with 1929.


In [6]:
y1929 = f.by_col('1929')

In [7]:
y1929


Out[7]:
[323,
 600,
 310,
 991,
 634,
 1024,
 1032,
 518,
 347,
 507,
 948,
 607,
 581,
 532,
 393,
 414,
 601,
 768,
 906,
 790,
 599,
 286,
 621,
 592,
 596,
 868,
 686,
 918,
 410,
 1152,
 332,
 382,
 771,
 455,
 668,
 772,
 874,
 271,
 426,
 378,
 479,
 551,
 634,
 434,
 741,
 460,
 673,
 675]

And now 2009


In [8]:
y2009 = f.by_col("2009")

In [9]:
y2009


Out[9]:
[32274,
 32077,
 31493,
 40902,
 40093,
 52736,
 40135,
 36565,
 33086,
 30987,
 40933,
 33174,
 35983,
 37036,
 31250,
 35151,
 35268,
 47159,
 49590,
 34280,
 40920,
 29318,
 35106,
 32699,
 37057,
 38009,
 41882,
 48123,
 32197,
 46844,
 33564,
 38672,
 35018,
 33708,
 35210,
 38827,
 41283,
 30835,
 36499,
 33512,
 35674,
 30107,
 36752,
 43211,
 40619,
 31843,
 35676,
 42504]

These are read into regular Python lists which are not particularly well suited to efficient data analysis. So let's convert them to numpy arrays


In [10]:
# Convert to a numpy array for efficient vectorized analysis.
# NOTE(review): this rebinds y2009 in place, so the original list form is
# only recoverable by re-running the earlier cell.
y2009 = np.array(y2009)

In [11]:
y2009


Out[11]:
array([32274, 32077, 31493, 40902, 40093, 52736, 40135, 36565, 33086,
       30987, 40933, 33174, 35983, 37036, 31250, 35151, 35268, 47159,
       49590, 34280, 40920, 29318, 35106, 32699, 37057, 38009, 41882,
       48123, 32197, 46844, 33564, 38672, 35018, 33708, 35210, 38827,
       41283, 30835, 36499, 33512, 35674, 30107, 36752, 43211, 40619,
       31843, 35676, 42504])

Much better. But pulling these in and converting them a column at a time is tedious and error prone. So we will do all of this in a list comprehension.


In [12]:
# Pull every annual column (1929-2009) at once and stack into a 2-D array.
# dtype=float replaces the original `* 1.0` trick, making the conversion of
# the integer incomes to floats explicit.
Y = np.array([f.by_col(str(year)) for year in range(1929, 2010)], dtype=float)

In [13]:
Y.shape


Out[13]:
(81, 48)

In [14]:
# Reorient to n x T: rows are states, columns are years.
Y = Y.transpose()

In [15]:
Y.shape


Out[15]:
(48, 81)

In [16]:
# Year labels for the x-axis; arange's end is exclusive, so this covers 1929-2009.
years = np.arange(1929,2010)

In [17]:
# Nominal per-capita income path for the first state (row 0 = Alabama).
plot(years,Y[0])


Out[17]:
[<matplotlib.lines.Line2D at 0x7fef9eb2c290>]

In [18]:
# Relative incomes: each state's income divided by the cross-sectional
# (per-year) mean, so 1.0 means "at the national average" in that year.
RY = Y / Y.mean(axis=0)

In [19]:
plot(years,RY[0])


Out[19]:
[<matplotlib.lines.Line2D at 0x7fef9e5ab5d0>]

In [20]:
# Array form enables vectorized comparisons like name == 'Ohio' below.
name = np.array(name)

In [21]:
np.nonzero(name=='Ohio')


Out[21]:
(array([32]),)

In [22]:
# Look up Ohio's row index dynamically rather than hard-coding 32 (the value
# found by np.nonzero in the previous cell), so this stays correct if the
# data ordering ever changes.
ohio = np.nonzero(name == 'Ohio')[0][0]
plot(years, RY[ohio], label='Ohio')
plot(years, RY[0], label='Alabama')
legend()


Out[22]:
<matplotlib.legend.Legend at 0x7fef9e4f9d90>

Spaghetti Plot


In [23]:
# Spaghetti plot: overlay one relative-income trajectory per state.
for row in RY:
    plot(years, row)


Kernel Density (univariate, aspatial)


In [24]:
from scipy.stats.kde import gaussian_kde

In [25]:
# Kernel density estimate of the 1929 (first-column) income distribution.
density = gaussian_kde(Y[:,0])

In [26]:
Y[:,0]


Out[26]:
array([  323.,   600.,   310.,   991.,   634.,  1024.,  1032.,   518.,
         347.,   507.,   948.,   607.,   581.,   532.,   393.,   414.,
         601.,   768.,   906.,   790.,   599.,   286.,   621.,   592.,
         596.,   868.,   686.,   918.,   410.,  1152.,   332.,   382.,
         771.,   455.,   668.,   772.,   874.,   271.,   426.,   378.,
         479.,   551.,   634.,   434.,   741.,   460.,   673.,   675.])

In [27]:
# NOTE(review): exact duplicate of the earlier cell that already computed
# gaussian_kde(Y[:,0]) -- redundant and could be removed.
density = gaussian_kde(Y[:,0])

In [28]:
# Evaluation grid padded 10% beyond the observed 1929 range.
minY0 = Y[:,0].min()*.90
maxY0 = Y[:,0].max()*1.10
x = np.linspace(minY0, maxY0, 100)

In [29]:
plot(x,density(x))


Out[29]:
[<matplotlib.lines.Line2D at 0x7fef9e247290>]

In [30]:
# Kernel density estimate of the 2009 (last-column) income distribution.
d2009 = gaussian_kde(Y[:,-1])

In [31]:
# Evaluation grid padded 10% beyond the observed 2009 range.
# Renamed from minY0/maxY0: those names misleadingly suggested the first
# (1929) column, while this cell works on the last (2009) column.
min2009 = Y[:, -1].min() * 0.90
max2009 = Y[:, -1].max() * 1.10
x = np.linspace(min2009, max2009, 100)

In [32]:
plot(x,d2009(x))


Out[32]:
[<matplotlib.lines.Line2D at 0x7fef9c985510>]

In [33]:
minR0 = RY.min()

In [34]:
maxR0 = RY.max()

In [35]:
x = np.linspace(minR0, maxR0, 100)

In [36]:
d1929 = gaussian_kde(RY[:,0])

In [37]:
d2009 = gaussian_kde(RY[:,-1])

In [38]:
plot(x, d1929(x))
plot(x, d2009(x))


Out[38]:
[<matplotlib.lines.Line2D at 0x7fef9c9bcb90>]

In [39]:
plot(x, d1929(x), label='1929')
plot(x, d2009(x), label='2009')
legend()


Out[39]:
<matplotlib.legend.Legend at 0x7fef9e2f55d0>

In [40]:
# Overlay the relative-income KDE for every year in a single figure.
for cs in RY.T: # take cross sections
    plot(x, gaussian_kde(cs)(x))



In [41]:
# Sigma convergence: a shrinking cross-sectional standard deviation of
# relative incomes over time indicates the distribution is tightening.
sigma = RY.std(axis=0)
plot(years, sigma)
ylabel('s')
xlabel('year')
title("Sigma-Convergence")


Out[41]:
<matplotlib.text.Text at 0x7fef9c6cd310>

So the distribution is becoming less dispersed over time.

But what about internal mixing? Do poor (rich) states remain poor (rich), or is there movement within the distribution over time?

Markov Chains


In [42]:
# Toy example: 5 chains (rows) observed over 3 periods, states 'a'/'b'/'c'.
c = np.array([['b','a','c'],
['c','c','a'],
['c','b','c'],
['a','a','b'],
['a','b','c']])

In [43]:
c


Out[43]:
array([['b', 'a', 'c'],
       ['c', 'c', 'a'],
       ['c', 'b', 'c'],
       ['a', 'a', 'b'],
       ['a', 'b', 'c']], 
      dtype='|S1')

In [44]:
# Fit a classic (aspatial) Markov chain to the toy data.
m = ps.Markov(c)

In [45]:
m.classes


Out[45]:
array(['a', 'b', 'c'], 
      dtype='|S1')

In [46]:
m.transitions


Out[46]:
array([[ 1.,  2.,  1.],
       [ 1.,  0.,  2.],
       [ 1.,  1.,  1.]])

In [47]:
m.p


Out[47]:
matrix([[ 0.25      ,  0.5       ,  0.25      ],
        [ 0.33333333,  0.        ,  0.66666667],
        [ 0.33333333,  0.33333333,  0.33333333]])

State Per Capita Incomes


In [48]:
f = ps.open(ps.examples.get_path("usjoin.csv"))
pci = np.array([f.by_col[str(y)] for y in range(1929,2010)])
pci.shape


Out[48]:
(81, 48)

Put series into cross-sectional quintiles (i.e., quintiles for each year)


In [49]:
# Discretize each year's cross-section into quintile codes (0-4), then
# transpose so rows are states and columns are years.
q5 = np.array([ps.Quantiles(y).yb for y in pci]).transpose()

In [50]:
q5.shape


Out[50]:
(48, 81)

In [51]:
q5[:,0]


Out[51]:
array([0, 2, 0, 4, 2, 4, 4, 1, 0, 1, 4, 2, 2, 1, 0, 1, 2, 3, 4, 4, 2, 0, 2,
       2, 2, 4, 3, 4, 0, 4, 0, 0, 3, 1, 3, 3, 4, 0, 1, 0, 1, 2, 2, 1, 3, 1,
       3, 3])

In [52]:
pci.shape


Out[52]:
(81, 48)

In [53]:
pci[0]


Out[53]:
array([ 323,  600,  310,  991,  634, 1024, 1032,  518,  347,  507,  948,
        607,  581,  532,  393,  414,  601,  768,  906,  790,  599,  286,
        621,  592,  596,  868,  686,  918,  410, 1152,  332,  382,  771,
        455,  668,  772,  874,  271,  426,  378,  479,  551,  634,  434,
        741,  460,  673,  675])

We are looping over the rows of pci (as y), which is ordered T x n (rows are cross-sections; row 0 is the cross-section for period 0).


In [54]:
# Fit a 5-state Markov chain to the quintile transitions.
m5 = ps.Markov(q5)

In [55]:
m5.classes


Out[55]:
array([0, 1, 2, 3, 4])

In [56]:
m5.transitions


Out[56]:
array([[ 729.,   71.,    1.,    0.,    0.],
       [  72.,  567.,   80.,    3.,    0.],
       [   0.,   81.,  631.,   86.,    2.],
       [   0.,    3.,   86.,  573.,   56.],
       [   0.,    0.,    1.,   57.,  741.]])

In [57]:
m5.p


Out[57]:
matrix([[ 0.91011236,  0.0886392 ,  0.00124844,  0.        ,  0.        ],
        [ 0.09972299,  0.78531856,  0.11080332,  0.00415512,  0.        ],
        [ 0.        ,  0.10125   ,  0.78875   ,  0.1075    ,  0.0025    ],
        [ 0.        ,  0.00417827,  0.11977716,  0.79805014,  0.07799443],
        [ 0.        ,  0.        ,  0.00125156,  0.07133917,  0.92740926]])

In [58]:
m5.steady_state


Out[58]:
matrix([[ 0.20774716],
        [ 0.18725774],
        [ 0.20740537],
        [ 0.18821787],
        [ 0.20937187]])

In [59]:
# First mean passage times between income quintiles (in years).
fmpt = ps.ergodic.fmpt(m5.p)

In [60]:
fmpt


Out[60]:
matrix([[   4.81354357,   11.50292712,   29.60921231,   53.38594954,
          103.59816743],
        [  42.04774505,    5.34023324,   18.74455332,   42.50023268,
           92.71316899],
        [  69.25849753,   27.21075248,    4.82147603,   25.27184624,
           75.43305672],
        [  84.90689329,   42.85914824,   17.18082642,    5.31299186,
           51.60953369],
        [  98.41295543,   56.36521038,   30.66046735,   14.21158356,
            4.77619083]])

For a state with income in the first quintile, it takes on average 11.5 years for it to first enter the second quintile, 29.6 to get to the third quintile, 53.4 years to enter the fourth, and 103.6 years to reach the richest quintile.

But, this approach assumes the movement of a state in the income distribution is independent of the movement of its neighbors or the position of the neighbors in the distribution. Does spatial context matter?

Dynamics of Spatial Dependence

Read in a GAL file to construct our W


In [61]:
w = ps.open(ps.examples.get_path("states48.gal")).read()

w.n

w.transform = 'R'

In [62]:
# Moran's I for each annual cross-section of relative incomes.
mits = [ps.Moran(cs, w) for cs in RY.T]

In [63]:
# Collect statistic, expectation, pseudo p-value, and z-value per year.
# NOTE(review): under Python 2 the comprehension variable m leaks and
# shadows the toy Markov object m defined earlier -- confirm nothing below
# still relies on that object.
res = np.array([(m.I, m.EI, m.p_sim, m.z_sim) for m in mits])

In [64]:
plot(years, res[:,0], label='I')
plot(years, res[:,1], label='E[I]')
title("Moran's I")
legend()


Out[64]:
<matplotlib.legend.Legend at 0x7fef9c529c10>

In [65]:
plot(years, res[:,-1])
ylim(0,7.0)
title('z-values, I')


Out[65]:
<matplotlib.text.Text at 0x7fef9c4d89d0>

Spatial Markov


In [66]:
pci.shape


Out[66]:
(81, 48)

In [67]:
# Reorient to n x T (states by years) for the spatial Markov analysis.
pci = pci.T

In [68]:
pci.shape


Out[68]:
(48, 81)

In [69]:
# Relative incomes: divide each year by its cross-sectional mean.
rpci = pci / pci.mean(axis=0)

In [70]:
rpci[:,0]


Out[70]:
array([ 0.5250254 ,  0.97527938,  0.50389434,  1.61083644,  1.03054521,
        1.6644768 ,  1.67748053,  0.8419912 ,  0.56403657,  0.82411107,
        1.54094142,  0.98665764,  0.94439553,  0.86474771,  0.63880799,
        0.67294277,  0.97690484,  1.2483576 ,  1.47267186,  1.28411785,
        0.97365391,  0.46488317,  1.00941416,  0.96227565,  0.96877751,
        1.41090417,  1.11506942,  1.49217745,  0.66644091,  1.8725364 ,
        0.53965459,  0.62092787,  1.253234  ,  0.73958686,  1.08581104,
        1.25485946,  1.42065696,  0.44050119,  0.69244836,  0.61442601,
        0.77859804,  0.89563156,  1.03054521,  0.70545208,  1.20447003,
        0.74771419,  1.09393837,  1.0971893 ])

In [71]:
rpci[:,0].mean()


Out[71]:
0.99999999999999989

In [72]:
# Spatial Markov: condition each state's transitions on the quintile of its
# spatial lag; k=5 quintile classes.
# NOTE(review): fixed=True presumably computes the quintile cutoffs once
# from the pooled series -- confirm against the PySAL docs.
sm = ps.Spatial_Markov(rpci, w, fixed=True, k=5)

In [73]:
sm.p


Out[73]:
matrix([[ 0.91461837,  0.07503234,  0.00905563,  0.00129366,  0.        ],
        [ 0.06570302,  0.82654402,  0.10512484,  0.00131406,  0.00131406],
        [ 0.00520833,  0.10286458,  0.79427083,  0.09505208,  0.00260417],
        [ 0.        ,  0.00913838,  0.09399478,  0.84856397,  0.04830287],
        [ 0.        ,  0.        ,  0.        ,  0.06217617,  0.93782383]])

In [74]:
# Conditional transition matrices, one per quintile of the spatial lag.
# Parenthesized print works under both Python 2 and Python 3 (the original
# `print p` statement is Python-2-only syntax).
for p in sm.P:
    print(p)


[[ 0.96341463  0.0304878   0.00609756  0.          0.        ]
 [ 0.06040268  0.83221477  0.10738255  0.          0.        ]
 [ 0.          0.14        0.74        0.12        0.        ]
 [ 0.          0.03571429  0.32142857  0.57142857  0.07142857]
 [ 0.          0.          0.          0.16666667  0.83333333]]
[[ 0.79831933  0.16806723  0.03361345  0.          0.        ]
 [ 0.0754717   0.88207547  0.04245283  0.          0.        ]
 [ 0.00537634  0.06989247  0.8655914   0.05913978  0.        ]
 [ 0.          0.          0.06372549  0.90196078  0.03431373]
 [ 0.          0.          0.          0.19444444  0.80555556]]
[[ 0.84693878  0.15306122  0.          0.          0.        ]
 [ 0.08133971  0.78947368  0.1291866   0.          0.        ]
 [ 0.00518135  0.0984456   0.79274611  0.0984456   0.00518135]
 [ 0.          0.          0.09411765  0.87058824  0.03529412]
 [ 0.          0.          0.          0.10204082  0.89795918]]
[[ 0.8852459   0.09836066  0.          0.01639344  0.        ]
 [ 0.03875969  0.81395349  0.13953488  0.          0.00775194]
 [ 0.0049505   0.09405941  0.77722772  0.11881188  0.0049505 ]
 [ 0.          0.02339181  0.12865497  0.75438596  0.09356725]
 [ 0.          0.          0.          0.09661836  0.90338164]]
[[ 0.33333333  0.66666667  0.          0.          0.        ]
 [ 0.0483871   0.77419355  0.16129032  0.01612903  0.        ]
 [ 0.01149425  0.16091954  0.74712644  0.08045977  0.        ]
 [ 0.          0.01036269  0.06217617  0.89637306  0.03108808]
 [ 0.          0.          0.          0.02352941  0.97647059]]

In [75]:
sm.S


Out[75]:
array([[ 0.43509425,  0.2635327 ,  0.20363044,  0.06841983,  0.02932278],
       [ 0.13391287,  0.33993305,  0.25153036,  0.23343016,  0.04119356],
       [ 0.12124869,  0.21137444,  0.2635101 ,  0.29013417,  0.1137326 ],
       [ 0.0776413 ,  0.19748806,  0.25352636,  0.22480415,  0.24654013],
       [ 0.01776781,  0.19964349,  0.19009833,  0.25524697,  0.3372434 ]])

In [76]:
# First mean passage time matrices, one per spatial-lag quintile.
# Renamed the loop variable: the original `f` shadowed the open file handle
# bound earlier in the notebook. Parenthesized print works under both
# Python 2 and Python 3.
for fmat in sm.F:
    print(fmat)


[[   2.29835259   28.95614035   46.14285714   80.80952381  279.42857143]
 [  33.86549708    3.79459555   22.57142857   57.23809524  255.85714286]
 [  43.60233918    9.73684211    4.91085714   34.66666667  233.28571429]
 [  46.62865497   12.76315789    6.25714286   14.61564626  198.61904762]
 [  52.62865497   18.76315789   12.25714286    6.           34.1031746 ]]
[[   7.46754205    9.70574606   25.76785714   74.53116883  194.23446197]
 [  27.76691978    2.94175577   24.97142857   73.73474026  193.4380334 ]
 [  53.57477715   28.48447637    3.97566318   48.76331169  168.46660482]
 [  72.03631562   46.94601483   18.46153846    4.28393653  119.70329314]
 [  77.17917276   52.08887197   23.6043956     5.14285714   24.27564033]]
[[   8.24751154    6.53333333   18.38765432   40.70864198  112.76732026]
 [  47.35040872    4.73094099   11.85432099   34.17530864  106.23398693]
 [  69.42288828   24.76666667    3.794921     22.32098765   94.37966594]
 [  83.72288828   39.06666667   14.3           3.44668119   76.36702977]
 [  93.52288828   48.86666667   24.1           9.8           8.79255406]]
[[  12.87974382   13.34847151   19.83446328   28.47257282   55.82395142]
 [  99.46114206    5.06359731   10.54545198   23.05133495   49.68944423]
 [ 117.76777159   23.03735526    3.94436301   15.0843986    43.57927247]
 [ 127.89752089   32.4393006    14.56853107    4.44831643   31.63099455]
 [ 138.24752089   42.7893006    24.91853107   10.35          4.05613474]]
[[  56.2815534     1.5          10.57236842   27.02173913  110.54347826]
 [  82.9223301     5.00892857    9.07236842   25.52173913  109.04347826]
 [  97.17718447   19.53125       5.26043557   21.42391304  104.94565217]
 [ 127.1407767    48.74107143   33.29605263    3.91777427   83.52173913]
 [ 169.6407767    91.24107143   75.79605263   42.5           2.96521739]]

LISA Markov


In [77]:
# LISA Markov: chain over local spatial association classes (1-4).
# NOTE(review): the classes presumably map to the Moran scatterplot
# quadrants (HH, LH, LL, HL) -- confirm ordering against the PySAL docs.
lm = ps.LISA_Markov(pci,w)

In [78]:
lm.classes


Out[78]:
array([1, 2, 3, 4])

In [79]:
lm.transitions


Out[79]:
array([[  1.08700000e+03,   4.40000000e+01,   4.00000000e+00,
          3.40000000e+01],
       [  4.10000000e+01,   4.70000000e+02,   3.60000000e+01,
          1.00000000e+00],
       [  5.00000000e+00,   3.40000000e+01,   1.42200000e+03,
          3.90000000e+01],
       [  3.00000000e+01,   1.00000000e+00,   4.00000000e+01,
          5.52000000e+02]])

In [80]:
# From here on, print floats with 3 decimals and suppress scientific notation.
np.set_printoptions(3, suppress=True)

In [81]:
lm.transitions


Out[81]:
array([[ 1087.,    44.,     4.,    34.],
       [   41.,   470.,    36.,     1.],
       [    5.,    34.,  1422.,    39.],
       [   30.,     1.,    40.,   552.]])

In [82]:
lm.p


Out[82]:
matrix([[ 0.93 ,  0.038,  0.003,  0.029],
        [ 0.075,  0.858,  0.066,  0.002],
        [ 0.003,  0.023,  0.948,  0.026],
        [ 0.048,  0.002,  0.064,  0.886]])

In [83]:
lm.steady_state


Out[83]:
matrix([[ 0.286],
        [ 0.142],
        [ 0.405],
        [ 0.168]])

In [84]:
# First mean passage times between the LISA classes.
ps.ergodic.fmpt(lm.p)


Out[84]:
matrix([[  3.501,  37.93 ,  40.558,  43.174],
        [ 31.728,   7.047,  28.682,  49.915],
        [ 52.445,  47.421,   2.47 ,  43.756],
        [ 38.768,  51.518,  26.316,   5.969]])

Test of independence of own chains and lag chains


In [85]:
lm.transitions


Out[85]:
array([[ 1087.,    44.,     4.,    34.],
       [   41.,   470.,    36.,     1.],
       [    5.,    34.,  1422.,    39.],
       [   30.,     1.,    40.,   552.]])

In [86]:
lm.expected_t


Out[86]:
array([[ 1123.281,    11.538,     0.348,    33.834],
       [    3.503,   528.474,    15.918,     0.106],
       [    0.154,    23.216,  1466.907,     9.723],
       [    9.608,     0.099,     6.235,   607.058]])

In [87]:
# Chi-squared test of independence between the own and lag chains.
# NOTE(review): the tuple looks like (statistic, p-value, dof) -- confirm
# the ordering against the PySAL LISA_Markov docs.
lm.chi_2


Out[87]:
(1058.2079036003051, 0.0, 9)